plasma = read.table("Plasma.txt", header = T)
sapply(plasma, class)
## AGE SEX SMOKSTAT QUETELET VITUSE CALORIES
## "integer" "character" "character" "numeric" "character" "numeric"
## FAT FIBER ALCOHOL CHOLESTEROL BETADIET RETDIET
## "numeric" "numeric" "numeric" "numeric" "integer" "integer"
## BETAPLASMA RETPLASMA
## "integer" "integer"
sapply(plasma, function(x) sum(is.na(x)))
## AGE SEX SMOKSTAT QUETELET VITUSE CALORIES
## 0 0 0 0 0 0
## FAT FIBER ALCOHOL CHOLESTEROL BETADIET RETDIET
## 0 0 0 0 0 0
## BETAPLASMA RETPLASMA
## 0 0
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
quant_plasma <- plasma %>%
select_if(function(col) is.integer(col) |
is.numeric(col))
summary(quant_plasma)
## AGE QUETELET CALORIES FAT
## Min. :19.00 Min. :16.33 Min. : 445.2 Min. : 14.40
## 1st Qu.:39.00 1st Qu.:21.80 1st Qu.:1338.0 1st Qu.: 53.95
## Median :48.00 Median :24.74 Median :1666.8 Median : 72.90
## Mean :50.15 Mean :26.16 Mean :1796.7 Mean : 77.03
## 3rd Qu.:62.50 3rd Qu.:28.85 3rd Qu.:2100.4 3rd Qu.: 95.25
## Max. :83.00 Max. :50.40 Max. :6662.2 Max. :235.90
## FIBER ALCOHOL CHOLESTEROL BETADIET
## Min. : 3.10 Min. : 0.000 Min. : 37.7 Min. : 214
## 1st Qu.: 9.15 1st Qu.: 0.000 1st Qu.:155.0 1st Qu.:1116
## Median :12.10 Median : 0.300 Median :206.3 Median :1802
## Mean :12.79 Mean : 3.279 Mean :242.5 Mean :2186
## 3rd Qu.:15.60 3rd Qu.: 3.200 3rd Qu.:308.9 3rd Qu.:2836
## Max. :36.80 Max. :203.000 Max. :900.7 Max. :9642
## RETDIET BETAPLASMA RETPLASMA
## Min. : 30.0 Min. : 0.0 Min. : 179.0
## 1st Qu.: 480.0 1st Qu.: 90.0 1st Qu.: 466.0
## Median : 707.0 Median : 140.0 Median : 566.0
## Mean : 832.7 Mean : 189.9 Mean : 602.8
## 3rd Qu.:1037.0 3rd Qu.: 230.0 3rd Qu.: 716.0
## Max. :6901.0 Max. :1415.0 Max. :1727.0
par(mfrow = c(2, 2))
for(i in 1:11){
hist(quant_plasma[, i], main=paste("Histogram of", names(quant_plasma)[i]))
}
par(mfrow=c(1,3))
for (i in 1:length(quant_plasma)) {
boxplot(quant_plasma[,i], main=names(quant_plasma[i]), type="l")
}
panel.cor <- function(x, y) {
par(usr = c(0, 1, 0, 1))
r <- round(cor(x, y, use = "complete.obs"), 2)
txt <- paste0("R = ", r)
cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * (abs(r) + 1))
}
pairs(quant_plasma, lower.panel = panel.cor)
### scatter plot matrix without extreme cases (with correlation)
rm1 <- which(quant_plasma$ALCOHOL == max(quant_plasma$ALCOHOL))
quant_plasma_rm <- quant_plasma[-rm1,]
rm2 <- which(quant_plasma_rm$RETDIET == max(quant_plasma_rm$RETDIET))
quant_plasma_rm <- quant_plasma_rm[-rm2,]
panel.cor <- function(x, y) {
par(usr = c(0, 1, 0, 1))
r <- round(cor(x, y, use = "complete.obs"), 2)
txt <- paste0("R = ", r)
cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * (abs(r) + 1))
}
pairs(quant_plasma_rm, lower.panel = panel.cor)
cor(quant_plasma)
## AGE QUETELET CALORIES FAT FIBER
## AGE 1.000000000 -0.017463821 -0.176769399 -0.16947981 0.04485175
## QUETELET -0.017463821 1.000000000 0.003526964 0.04875033 -0.08762333
## CALORIES -0.176769399 0.003526964 1.000000000 0.87184150 0.46548077
## FAT -0.169479813 0.048750329 0.871841504 1.00000000 0.27648356
## FIBER 0.044851750 -0.087623326 0.465480772 0.27648356 1.00000000
## ALCOHOL 0.051583268 -0.072695431 0.451469804 0.18574310 -0.02011748
## CHOLESTEROL -0.113605966 0.110257245 0.659175452 0.70984794 0.15396838
## BETADIET 0.071869247 -0.006603005 0.243376823 0.14342785 0.48264371
## RETDIET -0.009610795 0.032055785 0.402491661 0.41221481 0.21461162
## BETAPLASMA 0.101127650 -0.229387366 -0.022206957 -0.09164659 0.23595358
## RETPLASMA 0.211671702 0.013138652 -0.073328605 -0.09093779 -0.04443071
## ALCOHOL CHOLESTEROL BETADIET RETDIET BETAPLASMA
## AGE 0.05158327 -0.11360597 0.071869247 -0.009610795 0.10112765
## QUETELET -0.07269543 0.11025724 -0.006603005 0.032055785 -0.22938737
## CALORIES 0.45146980 0.65917545 0.243376823 0.402491661 -0.02220696
## FAT 0.18574310 0.70984794 0.143427853 0.412214814 -0.09164659
## FIBER -0.02011748 0.15396838 0.482643706 0.214611616 0.23595358
## ALCOHOL 1.00000000 0.18226398 0.039426478 0.044946841 -0.02221084
## CHOLESTEROL 0.18226398 1.00000000 0.115634801 0.443439304 -0.13030501
## BETADIET 0.03942648 0.11563480 1.000000000 0.052866904 0.22477951
## RETDIET 0.04494684 0.44343930 0.052866904 1.000000000 -0.04613524
## BETAPLASMA -0.02221084 -0.13030501 0.224779514 -0.046135240 1.00000000
## RETPLASMA 0.01713633 -0.07020134 -0.013539420 -0.062802201 0.07157724
## RETPLASMA
## AGE 0.21167170
## QUETELET 0.01313865
## CALORIES -0.07332861
## FAT -0.09093779
## FIBER -0.04443071
## ALCOHOL 0.01713633
## CHOLESTEROL -0.07020134
## BETADIET -0.01353942
## RETDIET -0.06280220
## BETAPLASMA 0.07157724
## RETPLASMA 1.00000000
r <- cor(quant_plasma)["BETAPLASMA","RETPLASMA"]
r
## [1] 0.07157724
n.q <- nrow(quant_plasma)
#Hypothesis testing for correlation among betaplasma and retplasma
#Null hypothesis : H0: r = 0
#Alternative hypothesis: H1: r not equal to 0
t_statistic <- (r*sqrt(n.q-2))/(sqrt(1-r^2))
t_value <- qt(1-0.05/2, n.q-2)
t_statistic > t_value
## [1] FALSE
#Discision Rule: when t_statistics > t_value = 1.967572, we reject null hypothesis.
#Conclusion: there is not enough evidence to conclude that under significance level 0.05 there is a linear relationship between BETAPLASMA and RETPLASMA
as.data.frame(cor(quant_plasma))
## AGE QUETELET CALORIES FAT FIBER
## AGE 1.000000000 -0.017463821 -0.176769399 -0.16947981 0.04485175
## QUETELET -0.017463821 1.000000000 0.003526964 0.04875033 -0.08762333
## CALORIES -0.176769399 0.003526964 1.000000000 0.87184150 0.46548077
## FAT -0.169479813 0.048750329 0.871841504 1.00000000 0.27648356
## FIBER 0.044851750 -0.087623326 0.465480772 0.27648356 1.00000000
## ALCOHOL 0.051583268 -0.072695431 0.451469804 0.18574310 -0.02011748
## CHOLESTEROL -0.113605966 0.110257245 0.659175452 0.70984794 0.15396838
## BETADIET 0.071869247 -0.006603005 0.243376823 0.14342785 0.48264371
## RETDIET -0.009610795 0.032055785 0.402491661 0.41221481 0.21461162
## BETAPLASMA 0.101127650 -0.229387366 -0.022206957 -0.09164659 0.23595358
## RETPLASMA 0.211671702 0.013138652 -0.073328605 -0.09093779 -0.04443071
## ALCOHOL CHOLESTEROL BETADIET RETDIET BETAPLASMA
## AGE 0.05158327 -0.11360597 0.071869247 -0.009610795 0.10112765
## QUETELET -0.07269543 0.11025724 -0.006603005 0.032055785 -0.22938737
## CALORIES 0.45146980 0.65917545 0.243376823 0.402491661 -0.02220696
## FAT 0.18574310 0.70984794 0.143427853 0.412214814 -0.09164659
## FIBER -0.02011748 0.15396838 0.482643706 0.214611616 0.23595358
## ALCOHOL 1.00000000 0.18226398 0.039426478 0.044946841 -0.02221084
## CHOLESTEROL 0.18226398 1.00000000 0.115634801 0.443439304 -0.13030501
## BETADIET 0.03942648 0.11563480 1.000000000 0.052866904 0.22477951
## RETDIET 0.04494684 0.44343930 0.052866904 1.000000000 -0.04613524
## BETAPLASMA -0.02221084 -0.13030501 0.224779514 -0.046135240 1.00000000
## RETPLASMA 0.01713633 -0.07020134 -0.013539420 -0.062802201 0.07157724
## RETPLASMA
## AGE 0.21167170
## QUETELET 0.01313865
## CALORIES -0.07332861
## FAT -0.09093779
## FIBER -0.04443071
## ALCOHOL 0.01713633
## CHOLESTEROL -0.07020134
## BETADIET -0.01353942
## RETDIET -0.06280220
## BETAPLASMA 0.07157724
## RETPLASMA 1.00000000
plasma$SEX <- as.factor(plasma$SEX)
plasma$SMOKSTAT <- as.factor(plasma$SMOKSTAT)
plasma$VITUSE <- as.factor(plasma$VITUSE)
sapply(plasma, class)
## AGE SEX SMOKSTAT QUETELET VITUSE CALORIES
## "integer" "factor" "factor" "numeric" "factor" "numeric"
## FAT FIBER ALCOHOL CHOLESTEROL BETADIET RETDIET
## "numeric" "numeric" "numeric" "numeric" "integer" "integer"
## BETAPLASMA RETPLASMA
## "integer" "integer"
levels(plasma$SEX)
## [1] "FEMALE" "MALE"
levels(plasma$SMOKSTAT)
## [1] "CURRENT" "FORMER" "NEVER"
levels(plasma$VITUSE)
## [1] "NO" "NOT OFTEN" "OFTEN"
par(mfrow = c(2,2))
table.sex <- table(plasma$SEX)
table.smok <- table(plasma$SMOKSTAT)
table.vit <- table(plasma$VITUSE)
barplot(table.sex, main = "Sex: bar chart")
barplot(table.smok, main = "Smokestat: bar chart")
barplot(table.vit, main = "Vituse: bar chart")
par(mfrow = c(2,2))
level.sex <- c("female", "male")
pct.sex <- round(table.sex/nrow(plasma)*100)
label.sex <- paste(paste(level.sex, pct.sex),"%")
pie(table.sex, labels = label.sex, col = c(1,2), main = "Sex: pie chart with percentage")
level.smok <- c("current", "former", "never")
pct.smok <- round(table.smok/nrow(plasma)*100)
label.smok <- paste(paste(level.smok, pct.smok),"%")
pie(table.smok, labels = label.smok, col = c(4,5,6), main = "Smokstat: pie chart with percentage")
level.vit <- c("no", "not often", "often")
pct.vit <- round(table.vit/nrow(plasma)*100)
label.vit <- paste(paste(level.vit, pct.vit),"%")
pie(table.vit, labels = label.vit, col = c(7,8,9), main = "Vituse: pie chart with percentage")
par(mfrow = c(2,3))
ctgr.name <- c("SEX","SMOKSTAT","VITUSE")
response.name <- c("BETAPLASMA", "RETPLASMA")
for (i in ctgr.name){
for (j in response.name){
m <- paste(j,":","side-by-side box plot by",i,"level")
boxplot(plasma[,j]~plasma[,i], main = m, xlab = i, ylab = j)
}
}
rm <- which(plasma$BETAPLASMA <= 0) #only 1 observation
plasma_rm <- plasma[-rm,] #Boxcox procedure can only deal with positive response variable
beta_plasma_rm <- plasma_rm[,-14]
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# ?if we should put subjective model here
fit.pre <- lm(BETAPLASMA~., data = beta_plasma_rm)
summary(fit.pre)
##
## Call:
## lm(formula = BETAPLASMA ~ ., data = beta_plasma_rm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -257.90 -88.68 -28.05 38.53 1071.60
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 181.639774 68.252089 2.661 0.008204 **
## AGE 0.718567 0.748057 0.961 0.337541
## SEXMALE -33.852475 31.807026 -1.064 0.288048
## SMOKSTATFORMER 41.532555 31.195470 1.331 0.184083
## SMOKSTATNEVER 47.547121 30.976114 1.535 0.125851
## QUETELET -5.929232 1.632950 -3.631 0.000332 ***
## VITUSENOT OFTEN 43.916308 25.323858 1.734 0.083916 .
## VITUSEOFTEN 79.267745 23.245584 3.410 0.000739 ***
## CALORIES -0.030255 0.051274 -0.590 0.555600
## FAT 0.101812 0.805645 0.126 0.899522
## FIBER 6.378290 2.830550 2.253 0.024960 *
## ALCOHOL 0.981143 1.242056 0.790 0.430192
## CHOLESTEROL -0.064525 0.113552 -0.568 0.570297
## BETADIET 0.015840 0.007510 2.109 0.035755 *
## RETDIET -0.007303 0.018724 -0.390 0.696784
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 168.2 on 299 degrees of freedom
## Multiple R-squared: 0.1928, Adjusted R-squared: 0.155
## F-statistic: 5.1 on 14 and 299 DF, p-value: 1.363e-08
par(mfrow=c(2,2))
plot(fit.pre, which=c(1,2))
boxcox(fit.pre)#lambda = 0,indicating log transformation
hist(log(plasma$BETAPLASMA), xlab ="log(BETAPLASMA)", main = "Histogram of log(BETAPLASMA)")
fit.pre.trans <- lm(log(BETAPLASMA)~., data = beta_plasma_rm) #try log transformation on BETAPLASMA
plot(fit.pre.trans, which=c(1,2))
set.seed(1024)
beta_plasma <- plasma[,-14]
index <- sample(1:315, ceiling(315*0.8), replace = FALSE)
beta_plasma_t <- beta_plasma[index,] #training data
beta_plasma_v <- beta_plasma[-index,] #validation data
beta_plasma_t_full<- lm(log(BETAPLASMA+1)~., data=beta_plasma_t)
summary(beta_plasma_t_full)
##
## Call:
## lm(formula = log(BETAPLASMA + 1) ~ ., data = beta_plasma_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.05512 -0.36039 0.03189 0.38717 1.80650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.215e+00 2.819e-01 18.499 < 2e-16 ***
## AGE 4.355e-03 3.170e-03 1.374 0.17076
## SEXMALE -3.120e-01 1.349e-01 -2.313 0.02160 *
## SMOKSTATFORMER 1.274e-01 1.325e-01 0.962 0.33720
## SMOKSTATNEVER 2.526e-01 1.299e-01 1.945 0.05296 .
## QUETELET -3.209e-02 6.864e-03 -4.675 4.95e-06 ***
## VITUSENOT OFTEN 2.611e-01 1.089e-01 2.397 0.01731 *
## VITUSEOFTEN 2.981e-01 9.890e-02 3.015 0.00285 **
## CALORIES -1.613e-04 2.158e-04 -0.747 0.45552
## FAT -9.856e-04 3.416e-03 -0.289 0.77322
## FIBER 1.720e-02 1.211e-02 1.420 0.15695
## ALCOHOL 4.347e-03 5.108e-03 0.851 0.39563
## CHOLESTEROL -1.227e-04 4.661e-04 -0.263 0.79255
## BETADIET 8.864e-05 3.298e-05 2.688 0.00770 **
## RETDIET 6.235e-05 7.597e-05 0.821 0.41264
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6476 on 237 degrees of freedom
## Multiple R-squared: 0.2675, Adjusted R-squared: 0.2243
## F-statistic: 6.183 on 14 and 237 DF, p-value: 1.837e-10
par(mfrow=c(2,2))
plot(beta_plasma_t_full)
library(leaps)
subset <- regsubsets(log(BETAPLASMA+1)~., data = beta_plasma_t, nvmax = 12, method ="exhaustive")
sum_sub <- summary(subset)
n.t <- nrow(beta_plasma_t)
p <- rowSums(sum_sub$which)
SSEp <- sum_sub$rss
R2 <- sum_sub$rsq
Ra2 <- sum_sub$adjr2
Cp <- sum_sub$cp
AIC <- n.t*log(SSEp/n.t)+2*p
BIC <-n.t*log(SSEp/n.t)+log(n.t)*p
criteria <- cbind(sum_sub$which, SSEp, R2, Ra2, Cp, AIC, BIC)
#add the criteria of null model
beta_plasma_t_null<- lm(log(BETAPLASMA+1)~1, data = beta_plasma_t)
sse0 <- anova(beta_plasma_t_null)["Residuals",2]
r0 <- summary(beta_plasma_t_null)$r.squared
ra0 <- summary(beta_plasma_t_null)$adj.r.squared
p0 <- 1
cp0 <- (sse0/summary(beta_plasma_t_full)$sigma^2)-(n.t-2*p0)
aic0 <- n.t*log(sse0/n.t)+2*p0
bic0 <- n.t*log(sse0/n.t)+log(n.t)*p0
null <- c(1,rep(0,14),sse0,r0,ra0,cp0,aic0,bic0)
criteria <- rbind(null, criteria)
criteria <- as.data.frame(criteria)
criteria
## (Intercept) AGE SEXMALE SMOKSTATFORMER SMOKSTATNEVER QUETELET
## null 1 0 0 0 0 0
## 1 1 0 0 0 0 1
## 2 1 0 0 0 0 1
## 3 1 0 0 0 0 1
## 4 1 0 0 0 1 1
## 5 1 0 0 0 0 1
## 6 1 0 0 0 1 1
## 7 1 0 1 0 1 1
## 8 1 1 1 0 1 1
## 9 1 1 1 0 1 1
## 10 1 1 1 1 1 1
## 11 1 1 1 1 1 1
## 12 1 1 1 1 1 1
## VITUSENOT OFTEN VITUSEOFTEN CALORIES FAT FIBER ALCOHOL CHOLESTEROL
## null 0 0 0 0 0 0 0
## 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0
## 3 0 0 0 1 0 0 0
## 4 0 0 0 1 0 0 0
## 5 1 1 0 1 0 0 0
## 6 1 1 0 1 0 0 0
## 7 1 1 0 1 0 0 0
## 8 1 1 0 1 0 0 0
## 9 1 1 0 1 1 0 0
## 10 1 1 0 1 1 0 0
## 11 1 1 1 0 1 1 0
## 12 1 1 1 0 1 1 0
## BETADIET RETDIET SSEp R2 Ra2 Cp AIC
## null 0 0 135.68773 0.00000000 0.00000000 73.568179 -154.0064
## 1 0 0 125.38785 0.07590872 0.07221235 51.006533 -171.9004
## 2 1 0 118.43776 0.12712991 0.12011890 36.432985 -184.2705
## 3 1 0 112.92261 0.16777581 0.15770859 25.281264 -194.2870
## 4 1 0 109.57139 0.19247383 0.17939648 19.289774 -199.8789
## 5 1 0 106.70709 0.21358333 0.19759925 14.459411 -204.5540
## 6 1 0 104.21510 0.23194898 0.21313957 10.516869 -208.5090
## 7 1 0 102.77856 0.24253608 0.22080556 9.091220 -210.0068
## 8 1 0 101.21785 0.25403827 0.22947986 7.369479 -211.8628
## 9 1 0 100.34463 0.26047383 0.23297079 7.287136 -212.0463
## 10 1 0 99.96620 0.26326281 0.23269280 8.384711 -210.9984
## 11 1 0 99.70535 0.26518522 0.23150621 9.762679 -209.6569
## 12 1 1 99.45994 0.26699386 0.23019021 11.177461 -208.2779
## BIC
## null -150.4770
## 1 -164.8415
## 2 -173.6822
## 3 -180.1693
## 4 -182.2317
## 5 -183.3775
## 6 -183.8030
## 7 -181.7714
## 8 -180.0979
## 9 -176.7520
## 10 -172.1747
## 11 -167.3037
## 12 -162.3953
which.best.sub <- data.frame(
Ra2 = which.max(criteria$Ra2),
Cp = which.min(criteria$Cp),
AIC = which.min(criteria$AIC),
BIC = which.min(criteria$BIC)
)
which.best.sub
## Ra2 Cp AIC BIC
## 1 10 10 10 7
rbind(criteria[10,],criteria[7,])
## (Intercept) AGE SEXMALE SMOKSTATFORMER SMOKSTATNEVER QUETELET VITUSENOT OFTEN
## 9 1 1 1 0 1 1 1
## 6 1 0 0 0 1 1 1
## VITUSEOFTEN CALORIES FAT FIBER ALCOHOL CHOLESTEROL BETADIET RETDIET SSEp
## 9 1 0 1 1 0 0 1 0 100.3446
## 6 1 0 1 0 0 0 1 0 104.2151
## R2 Ra2 Cp AIC BIC
## 9 0.2604738 0.2329708 7.287136 -212.0463 -176.752
## 6 0.2319490 0.2131396 10.516869 -208.5090 -183.803
By adjusted r squared, Mallow’s Cp and AIC criterion, the model containing AGE, SEX, SMOKSTAT, QUETELET, VITUSE, FAT, FIBER, BETADIET is the best. By BIC criteria which prefers a smaller model than AIC criteria, the model containing SMOKSTAT, QUETELET, VITUSE, FAT, BETADIET is the best.
#best subset from Cp and AIC and adjusted r square
best.sub1 <- lm(log(BETAPLASMA+1)~AGE+SEX+SMOKSTAT+QUETELET+VITUSE+FAT+FIBER+BETADIET, data = beta_plasma_t)
summary(best.sub1)
##
## Call:
## lm(formula = log(BETAPLASMA + 1) ~ AGE + SEX + SMOKSTAT + QUETELET +
## VITUSE + FAT + FIBER + BETADIET, data = beta_plasma_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.99316 -0.35874 0.02324 0.39915 1.81732
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.171e+00 2.742e-01 18.862 < 2e-16 ***
## AGE 5.285e-03 3.026e-03 1.747 0.08199 .
## SEXMALE -3.162e-01 1.290e-01 -2.451 0.01495 *
## SMOKSTATFORMER 1.248e-01 1.307e-01 0.955 0.34046
## SMOKSTATNEVER 2.537e-01 1.279e-01 1.983 0.04846 *
## QUETELET -3.286e-02 6.756e-03 -4.863 2.08e-06 ***
## VITUSENOT OFTEN 2.477e-01 1.071e-01 2.313 0.02159 *
## VITUSEOFTEN 2.807e-01 9.676e-02 2.901 0.00406 **
## FAT -3.135e-03 1.309e-03 -2.395 0.01739 *
## FIBER 1.280e-02 9.614e-03 1.331 0.18437
## BETADIET 8.782e-05 3.247e-05 2.704 0.00733 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.644 on 241 degrees of freedom
## Multiple R-squared: 0.2633, Adjusted R-squared: 0.2327
## F-statistic: 8.612 on 10 and 241 DF, p-value: 5.164e-12
par(mfrow=c(2,2))
plot(best.sub1)
#best subset from BIC
best.sub2 <- lm(log(BETAPLASMA+1)~SMOKSTAT+QUETELET+VITUSE+FAT+BETADIET, data = beta_plasma_t)
summary(best.sub2)
##
## Call:
## lm(formula = log(BETAPLASMA + 1) ~ SMOKSTAT + QUETELET + VITUSE +
## FAT + BETADIET, data = beta_plasma_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0236 -0.3646 0.0149 0.4152 1.7755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.4894529 0.2229717 24.620 < 2e-16 ***
## SMOKSTATFORMER 0.1688395 0.1295208 1.304 0.193609
## SMOKSTATNEVER 0.3262964 0.1258846 2.592 0.010117 *
## QUETELET -0.0336699 0.0067838 -4.963 1.3e-06 ***
## VITUSENOT OFTEN 0.2669619 0.1058477 2.522 0.012302 *
## VITUSEOFTEN 0.3145361 0.0969823 3.243 0.001347 **
## FAT -0.0034961 0.0012253 -2.853 0.004700 **
## BETADIET 0.0001100 0.0000288 3.821 0.000169 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6513 on 244 degrees of freedom
## Multiple R-squared: 0.2373, Adjusted R-squared: 0.2154
## F-statistic: 10.84 on 7 and 244 DF, p-value: 6.703e-12
par(mfrow=c(2,2))
plot(best.sub2)
library(MASS)
step.sub1 <- stepAIC(beta_plasma_t_null, scope = list(upper=beta_plasma_t_full, lower=~1), direction = "both", k = 2, trace = FALSE)
step.model1 <- lm(log(BETAPLASMA+1) ~ QUETELET + VITUSE + BETADIET +
FAT + SMOKSTAT + SEX + AGE, data = beta_plasma_t)
summary(step.model1)
##
## Call:
## lm(formula = log(BETAPLASMA + 1) ~ QUETELET + VITUSE + BETADIET +
## FAT + SMOKSTAT + SEX + AGE, data = beta_plasma_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.95934 -0.34445 0.03701 0.39190 1.76567
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.254e+00 2.675e-01 19.639 < 2e-16 ***
## QUETELET -3.391e-02 6.720e-03 -5.046 8.85e-07 ***
## VITUSENOT OFTEN 2.542e-01 1.072e-01 2.372 0.018497 *
## VITUSEOFTEN 2.866e-01 9.682e-02 2.960 0.003382 **
## BETADIET 1.085e-04 2.855e-05 3.802 0.000182 ***
## FAT -2.656e-03 1.260e-03 -2.107 0.036150 *
## SMOKSTATFORMER 1.449e-01 1.300e-01 1.114 0.266318
## SMOKSTATNEVER 2.805e-01 1.265e-01 2.217 0.027533 *
## SEXMALE -3.072e-01 1.290e-01 -2.381 0.018028 *
## AGE 5.266e-03 3.031e-03 1.737 0.083572 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6451 on 242 degrees of freedom
## Multiple R-squared: 0.2578, Adjusted R-squared: 0.2302
## F-statistic: 9.342 on 9 and 242 DF, p-value: 3.615e-12
beta_plasma_t_full2 <- lm(log(BETAPLASMA+1)~.^2, data = beta_plasma_t)
step.sub2 <- stepAIC(beta_plasma_t_null, scope = list(upper=beta_plasma_t_full2, lower=~1), direction = "both", k = 2, trace = FALSE)
step.model2 <- lm(log(BETAPLASMA +1) ~ CHOLESTEROL + FIBER + QUETELET + VITUSE + BETADIET + AGE + RETDIET + SMOKSTAT + VITUSE:BETADIET + CHOLESTEROL:RETDIET + AGE:RETDIET + VITUSE:SMOKSTAT + RETDIET:SMOKSTAT + CHOLESTEROL:QUETELET + BETADIET:AGE + FIBER:SMOKSTAT + CHOLESTEROL:FIBER, data = beta_plasma_t)
summary(step.model2)
##
## Call:
## lm(formula = log(BETAPLASMA + 1) ~ CHOLESTEROL + FIBER + QUETELET +
## VITUSE + BETADIET + AGE + RETDIET + SMOKSTAT + VITUSE:BETADIET +
## CHOLESTEROL:RETDIET + AGE:RETDIET + VITUSE:SMOKSTAT + RETDIET:SMOKSTAT +
## CHOLESTEROL:QUETELET + BETADIET:AGE + FIBER:SMOKSTAT + CHOLESTEROL:FIBER,
## data = beta_plasma_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97244 -0.33469 0.01207 0.36536 1.66268
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.133e+00 5.947e-01 10.313 < 2e-16 ***
## CHOLESTEROL -2.140e-03 1.496e-03 -1.431 0.15395
## FIBER -2.775e-02 2.923e-02 -0.949 0.34340
## QUETELET -4.421e-02 1.466e-02 -3.016 0.00286 **
## VITUSENOT OFTEN -6.311e-02 2.944e-01 -0.214 0.83045
## VITUSEOFTEN -5.191e-01 2.979e-01 -1.743 0.08277 .
## BETADIET 1.576e-04 1.351e-04 1.167 0.24460
## AGE 2.126e-03 6.866e-03 0.310 0.75716
## RETDIET -2.040e-04 3.569e-04 -0.572 0.56811
## SMOKSTATFORMER -3.972e-01 3.918e-01 -1.014 0.31169
## SMOKSTATNEVER -3.612e-01 3.663e-01 -0.986 0.32520
## VITUSENOT OFTEN:BETADIET 2.226e-05 8.075e-05 0.276 0.78303
## VITUSEOFTEN:BETADIET 1.525e-04 7.083e-05 2.153 0.03236 *
## CHOLESTEROL:RETDIET -1.248e-07 4.243e-07 -0.294 0.76888
## AGE:RETDIET 8.134e-06 6.120e-06 1.329 0.18517
## VITUSENOT OFTEN:SMOKSTATFORMER 6.574e-01 3.312e-01 1.985 0.04836 *
## VITUSEOFTEN:SMOKSTATFORMER 7.090e-01 3.190e-01 2.222 0.02725 *
## VITUSENOT OFTEN:SMOKSTATNEVER 1.457e-01 3.269e-01 0.446 0.65626
## VITUSEOFTEN:SMOKSTATNEVER 4.893e-01 3.062e-01 1.598 0.11144
## RETDIET:SMOKSTATFORMER -1.812e-04 2.971e-04 -0.610 0.54265
## RETDIET:SMOKSTATNEVER -1.026e-04 2.672e-04 -0.384 0.70122
## CHOLESTEROL:QUETELET 6.264e-05 5.109e-05 1.226 0.22142
## BETADIET:AGE -2.800e-06 2.390e-06 -1.171 0.24272
## FIBER:SMOKSTATFORMER 3.364e-02 2.828e-02 1.189 0.23551
## FIBER:SMOKSTATNEVER 5.296e-02 2.668e-02 1.985 0.04837 *
## CHOLESTEROL:FIBER -3.074e-05 6.912e-05 -0.445 0.65695
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.642 on 226 degrees of freedom
## Multiple R-squared: 0.3136, Adjusted R-squared: 0.2376
## F-statistic: 4.129 on 25 and 226 DF, p-value: 3.895e-09
#find SSE for step.model1 and step.model2
sse.bs1 <- anova(best.sub1)["Residuals",2]
sse.bs2 <- anova(best.sub2)["Residuals",2]
sse.sm1 <- anova(step.model1)["Residuals",2]
sse.sm2 <- anova(step.model2)["Residuals",2]
sse.compare <- c(sse.bs1,sse.bs2,sse.sm1,sse.sm2)
#find MSE for step.model1 and step.model2
mse.bs1 <- anova(best.sub1)["Residuals",3]
mse.bs2 <- anova(best.sub2)["Residuals",3]
mse.sm1 <- anova(step.model1)["Residuals",3]
mse.sm2 <- anova(step.model2)["Residuals",3]
mse.compare <- c(mse.bs1,mse.bs2,mse.sm1,mse.sm2)
#find p for step.model1 and step.model2
p.bs1 <-9
p.bs2 <-6
p.sm1 <-7
p.sm2 <-17
p.compare <- c(p.bs1,p.bs2,p.sm1,p.sm2)
#find Cp for step.model1 and step.model2
#sigma^2: MSE_fullmodel(create a fit3 related to step.model1)
#n.t: nrow(training data)
beta_plasma_tt2 <- beta_plasma_t[,c("BETAPLASMA","AGE","SEX","SMOKSTAT","QUETELET","VITUSE","FAT","FIBER","BETADIET")]
Full4 <- lm(log(BETAPLASMA+1)~.^2, data=beta_plasma_tt2)
mse4 <- anova(Full4)['Residuals',3]
beta_plasma_tt <- beta_plasma_t[,c("BETAPLASMA","QUETELET","VITUSE","CHOLESTEROL","BETADIET","SMOKSTAT","SEX")]
Full3 <- lm(log(BETAPLASMA+1)~.^2, data=beta_plasma_tt)
length(Full3$coef)
## [1] 35
mse3 <- anova(Full3)['Residuals',3]
cp.bs1 <- (sse.bs1/mse4)-(n.t-2*p.bs1)
cp.bs2 <- (sse.bs2/mse4)-(n.t-2*p.bs2)
cp.sm1 <- (sse.sm1/mse3)-(n.t-2*p.sm1)
cp.sm2 <- (sse.sm2/mse3)-(n.t-2*p.sm2)
cp.compare <- c(cp.bs1,cp.bs2,cp.sm1,cp.sm2)
#find Pressp for step.model1 and step.model2
press.bs1 <- sum(best.sub1$residuals^2/(1-influence(best.sub1)$hat)^2)
press.bs2 <- sum(best.sub2$residuals^2/(1-influence(best.sub2)$hat)^2)
press.sm1 <- sum(step.model1$residuals^2/(1-influence(step.model1)$hat)^2)
press.sm2 <- sum(step.model2$residuals^2/(1-influence(step.model2)$hat)^2)
press.compare <- c(press.bs1,press.bs2,press.sm1,press.sm2)
compare <- data.frame(sse=sse.compare, mse=mse.compare, p=p.compare, cp=cp.compare, press=press.compare)
rownames(compare) <- c("best subset 1","best subset 2","stepwise mode1", "stepwise model2")
compare
## sse mse p cp press
## best subset 1 99.96620 0.4147975 9 12.972734 109.5064
## best subset 2 103.49433 0.4241571 6 15.689211 110.4410
## stepwise mode1 100.70131 0.4161211 7 9.511715 109.3472
## stepwise model2 93.14226 0.4121339 17 10.932479 132.0332
##best subsets, stepwise models on validation data
best.sub1.v <- lm(best.sub1, data = beta_plasma_v)
best.sub2.v <- lm(best.sub2, data = beta_plasma_v)
step.model1.v <- lm(step.model1, data = beta_plasma_v)
step.model2.v <- lm(step.model2, data = beta_plasma_v)
#summary on training data and validation data respectively
list(summary(best.sub1), summary(best.sub2),summary(step.model1),summary(step.model2))
## [[1]]
##
## Call:
## lm(formula = log(BETAPLASMA + 1) ~ AGE + SEX + SMOKSTAT + QUETELET +
## VITUSE + FAT + FIBER + BETADIET, data = beta_plasma_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.99316 -0.35874 0.02324 0.39915 1.81732
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.171e+00 2.742e-01 18.862 < 2e-16 ***
## AGE 5.285e-03 3.026e-03 1.747 0.08199 .
## SEXMALE -3.162e-01 1.290e-01 -2.451 0.01495 *
## SMOKSTATFORMER 1.248e-01 1.307e-01 0.955 0.34046
## SMOKSTATNEVER 2.537e-01 1.279e-01 1.983 0.04846 *
## QUETELET -3.286e-02 6.756e-03 -4.863 2.08e-06 ***
## VITUSENOT OFTEN 2.477e-01 1.071e-01 2.313 0.02159 *
## VITUSEOFTEN 2.807e-01 9.676e-02 2.901 0.00406 **
## FAT -3.135e-03 1.309e-03 -2.395 0.01739 *
## FIBER 1.280e-02 9.614e-03 1.331 0.18437
## BETADIET 8.782e-05 3.247e-05 2.704 0.00733 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.644 on 241 degrees of freedom
## Multiple R-squared: 0.2633, Adjusted R-squared: 0.2327
## F-statistic: 8.612 on 10 and 241 DF, p-value: 5.164e-12
##
##
## [[2]]
##
## Call:
## lm(formula = log(BETAPLASMA + 1) ~ SMOKSTAT + QUETELET + VITUSE +
## FAT + BETADIET, data = beta_plasma_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0236 -0.3646 0.0149 0.4152 1.7755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.4894529 0.2229717 24.620 < 2e-16 ***
## SMOKSTATFORMER 0.1688395 0.1295208 1.304 0.193609
## SMOKSTATNEVER 0.3262964 0.1258846 2.592 0.010117 *
## QUETELET -0.0336699 0.0067838 -4.963 1.3e-06 ***
## VITUSENOT OFTEN 0.2669619 0.1058477 2.522 0.012302 *
## VITUSEOFTEN 0.3145361 0.0969823 3.243 0.001347 **
## FAT -0.0034961 0.0012253 -2.853 0.004700 **
## BETADIET 0.0001100 0.0000288 3.821 0.000169 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6513 on 244 degrees of freedom
## Multiple R-squared: 0.2373, Adjusted R-squared: 0.2154
## F-statistic: 10.84 on 7 and 244 DF, p-value: 6.703e-12
##
##
## [[3]]
##
## Call:
## lm(formula = log(BETAPLASMA + 1) ~ QUETELET + VITUSE + BETADIET +
## FAT + SMOKSTAT + SEX + AGE, data = beta_plasma_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.95934 -0.34445 0.03701 0.39190 1.76567
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.254e+00 2.675e-01 19.639 < 2e-16 ***
## QUETELET -3.391e-02 6.720e-03 -5.046 8.85e-07 ***
## VITUSENOT OFTEN 2.542e-01 1.072e-01 2.372 0.018497 *
## VITUSEOFTEN 2.866e-01 9.682e-02 2.960 0.003382 **
## BETADIET 1.085e-04 2.855e-05 3.802 0.000182 ***
## FAT -2.656e-03 1.260e-03 -2.107 0.036150 *
## SMOKSTATFORMER 1.449e-01 1.300e-01 1.114 0.266318
## SMOKSTATNEVER 2.805e-01 1.265e-01 2.217 0.027533 *
## SEXMALE -3.072e-01 1.290e-01 -2.381 0.018028 *
## AGE 5.266e-03 3.031e-03 1.737 0.083572 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6451 on 242 degrees of freedom
## Multiple R-squared: 0.2578, Adjusted R-squared: 0.2302
## F-statistic: 9.342 on 9 and 242 DF, p-value: 3.615e-12
##
##
## [[4]]
##
## Call:
## lm(formula = log(BETAPLASMA + 1) ~ CHOLESTEROL + FIBER + QUETELET +
## VITUSE + BETADIET + AGE + RETDIET + SMOKSTAT + VITUSE:BETADIET +
## CHOLESTEROL:RETDIET + AGE:RETDIET + VITUSE:SMOKSTAT + RETDIET:SMOKSTAT +
## CHOLESTEROL:QUETELET + BETADIET:AGE + FIBER:SMOKSTAT + CHOLESTEROL:FIBER,
## data = beta_plasma_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97244 -0.33469 0.01207 0.36536 1.66268
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.133e+00 5.947e-01 10.313 < 2e-16 ***
## CHOLESTEROL -2.140e-03 1.496e-03 -1.431 0.15395
## FIBER -2.775e-02 2.923e-02 -0.949 0.34340
## QUETELET -4.421e-02 1.466e-02 -3.016 0.00286 **
## VITUSENOT OFTEN -6.311e-02 2.944e-01 -0.214 0.83045
## VITUSEOFTEN -5.191e-01 2.979e-01 -1.743 0.08277 .
## BETADIET 1.576e-04 1.351e-04 1.167 0.24460
## AGE 2.126e-03 6.866e-03 0.310 0.75716
## RETDIET -2.040e-04 3.569e-04 -0.572 0.56811
## SMOKSTATFORMER -3.972e-01 3.918e-01 -1.014 0.31169
## SMOKSTATNEVER -3.612e-01 3.663e-01 -0.986 0.32520
## VITUSENOT OFTEN:BETADIET 2.226e-05 8.075e-05 0.276 0.78303
## VITUSEOFTEN:BETADIET 1.525e-04 7.083e-05 2.153 0.03236 *
## CHOLESTEROL:RETDIET -1.248e-07 4.243e-07 -0.294 0.76888
## AGE:RETDIET 8.134e-06 6.120e-06 1.329 0.18517
## VITUSENOT OFTEN:SMOKSTATFORMER 6.574e-01 3.312e-01 1.985 0.04836 *
## VITUSEOFTEN:SMOKSTATFORMER 7.090e-01 3.190e-01 2.222 0.02725 *
## VITUSENOT OFTEN:SMOKSTATNEVER 1.457e-01 3.269e-01 0.446 0.65626
## VITUSEOFTEN:SMOKSTATNEVER 4.893e-01 3.062e-01 1.598 0.11144
## RETDIET:SMOKSTATFORMER -1.812e-04 2.971e-04 -0.610 0.54265
## RETDIET:SMOKSTATNEVER -1.026e-04 2.672e-04 -0.384 0.70122
## CHOLESTEROL:QUETELET 6.264e-05 5.109e-05 1.226 0.22142
## BETADIET:AGE -2.800e-06 2.390e-06 -1.171 0.24272
## FIBER:SMOKSTATFORMER 3.364e-02 2.828e-02 1.189 0.23551
## FIBER:SMOKSTATNEVER 5.296e-02 2.668e-02 1.985 0.04837 *
## CHOLESTEROL:FIBER -3.074e-05 6.912e-05 -0.445 0.65695
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.642 on 226 degrees of freedom
## Multiple R-squared: 0.3136, Adjusted R-squared: 0.2376
## F-statistic: 4.129 on 25 and 226 DF, p-value: 3.895e-09
list(summary(best.sub1.v), summary(best.sub2.v),summary(step.model1.v),summary(step.model2.v))
## [[1]]
##
## Call:
## lm(formula = best.sub1, data = beta_plasma_v)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6792 -0.3864 0.0216 0.3397 1.8382
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.568e+00 1.001e+00 4.566 3.08e-05 ***
## AGE 5.231e-03 1.002e-02 0.522 0.60370
## SEXMALE 6.092e-01 4.202e-01 1.450 0.15314
## SMOKSTATFORMER 6.603e-01 4.100e-01 1.610 0.11336
## SMOKSTATNEVER 3.660e-01 4.149e-01 0.882 0.38186
## QUETELET -3.833e-02 2.212e-02 -1.733 0.08904 .
## VITUSENOT OFTEN 5.585e-01 3.373e-01 1.656 0.10382
## VITUSEOFTEN 2.727e-01 3.081e-01 0.885 0.38018
## FAT -8.900e-03 4.291e-03 -2.074 0.04303 *
## FIBER 6.452e-02 2.273e-02 2.839 0.00644 **
## BETADIET 6.002e-06 8.446e-05 0.071 0.94362
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8669 on 52 degrees of freedom
## Multiple R-squared: 0.3461, Adjusted R-squared: 0.2204
## F-statistic: 2.753 on 10 and 52 DF, p-value: 0.008433
##
##
## [[2]]
##
## Call:
## lm(formula = best.sub2, data = beta_plasma_v)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0121 -0.4015 0.0569 0.4081 1.8483
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.604e+00 8.618e-01 6.502 2.46e-08 ***
## SMOKSTATFORMER 7.871e-01 4.376e-01 1.799 0.0776 .
## SMOKSTATNEVER 6.101e-01 4.373e-01 1.395 0.1685
## QUETELET -5.222e-02 2.307e-02 -2.264 0.0276 *
## VITUSENOT OFTEN 3.463e-01 3.541e-01 0.978 0.3323
## VITUSEOFTEN 1.479e-01 3.267e-01 0.453 0.6526
## FAT -4.788e-03 3.951e-03 -1.212 0.2308
## BETADIET 7.305e-05 8.459e-05 0.864 0.3915
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.934 on 55 degrees of freedom
## Multiple R-squared: 0.1972, Adjusted R-squared: 0.09503
## F-statistic: 1.93 on 7 and 55 DF, p-value: 0.08212
##
##
## [[3]]
##
## Call:
## lm(formula = step.model1, data = beta_plasma_v)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8359 -0.3859 -0.0263 0.4063 1.8168
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.911e+00 1.057e+00 4.645 2.29e-05 ***
## QUETELET -4.886e-02 2.321e-02 -2.105 0.0400 *
## VITUSENOT OFTEN 4.585e-01 3.571e-01 1.284 0.2048
## VITUSEOFTEN 1.977e-01 3.267e-01 0.605 0.5476
## BETADIET 6.001e-05 8.760e-05 0.685 0.4963
## FAT -5.466e-03 4.382e-03 -1.247 0.2178
## SMOKSTATFORMER 7.954e-01 4.335e-01 1.835 0.0721 .
## SMOKSTATNEVER 6.085e-01 4.323e-01 1.408 0.1651
## SEXMALE 3.904e-01 4.397e-01 0.888 0.3787
## AGE 1.223e-02 1.033e-02 1.183 0.2421
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9228 on 53 degrees of freedom
## Multiple R-squared: 0.2448, Adjusted R-squared: 0.1165
## F-statistic: 1.909 on 9 and 53 DF, p-value: 0.07065
##
##
## [[4]]
##
## Call:
## lm(formula = step.model2, data = beta_plasma_v)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.84211 -0.30962 -0.03657 0.24799 1.28694
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.201e+00 5.805e+00 0.896 0.376
## CHOLESTEROL -5.417e-03 6.561e-03 -0.826 0.414
## FIBER -2.267e-01 4.151e-01 -0.546 0.588
## QUETELET -4.983e-02 3.671e-02 -1.358 0.183
## VITUSENOT OFTEN 1.276e+00 1.252e+00 1.019 0.315
## VITUSEOFTEN 4.022e-01 1.890e+00 0.213 0.833
## BETADIET 9.551e-05 3.281e-04 0.291 0.773
## AGE 1.826e-02 2.356e-02 0.775 0.443
## RETDIET 2.963e-03 1.868e-03 1.586 0.121
## SMOKSTATFORMER 4.174e-01 6.238e+00 0.067 0.947
## SMOKSTATNEVER -4.727e-01 6.188e+00 -0.076 0.940
## VITUSENOT OFTEN:BETADIET -2.327e-04 2.004e-04 -1.161 0.253
## VITUSEOFTEN:BETADIET -1.060e-04 1.562e-04 -0.679 0.502
## CHOLESTEROL:RETDIET -4.268e-06 9.081e-07 -4.701 3.54e-05 ***
## AGE:RETDIET -1.716e-05 2.274e-05 -0.755 0.455
## VITUSENOT OFTEN:SMOKSTATFORMER -3.704e-01 1.275e+00 -0.291 0.773
## VITUSEOFTEN:SMOKSTATFORMER 3.954e-01 1.936e+00 0.204 0.839
## VITUSENOT OFTEN:SMOKSTATNEVER -9.741e-01 1.304e+00 -0.747 0.460
## VITUSEOFTEN:SMOKSTATNEVER -2.921e-01 1.920e+00 -0.152 0.880
## RETDIET:SMOKSTATFORMER -1.644e-03 1.413e-03 -1.164 0.252
## RETDIET:SMOKSTATNEVER -1.064e-03 1.448e-03 -0.734 0.467
## CHOLESTEROL:QUETELET 1.045e-04 1.677e-04 0.623 0.537
## BETADIET:AGE -1.036e-06 6.708e-06 -0.155 0.878
## FIBER:SMOKSTATFORMER 1.445e-01 4.223e-01 0.342 0.734
## FIBER:SMOKSTATNEVER 2.075e-01 4.147e-01 0.500 0.620
## CHOLESTEROL:FIBER 4.882e-04 3.086e-04 1.582 0.122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6017 on 37 degrees of freedom
## Multiple R-squared: 0.7758, Adjusted R-squared: 0.6244
## F-statistic: 5.122 on 25 and 37 DF, p-value: 4.377e-06
#percent change in parameter estimation
pct_chg.coef <- function(model, modelv, digit){
coef.m <- coef(model)
coef.v <- coef(modelv)
pct.chg.coef <- round(abs(coef.v-coef.m)/abs(coef.m)*100, digit)
pct.chg.coef
}
pct_chg.coef.bs1 <- pct_chg.coef(best.sub1, best.sub1.v, 2)
pct_chg.coef.bs2 <- pct_chg.coef(best.sub2, best.sub2.v, 2)
pct_chg.coef.sm1 <- pct_chg.coef(step.model1, step.model1.v, 2)
pct_chg.coef.sm2 <- pct_chg.coef(step.model2, step.model2.v, 2)
#percent change in standard error
pct_chg.sd <- function(model, modelv, digit){
sd.m <- summary(model)$coefficients[, "Std. Error"]
sd.v <- summary(modelv)$coefficients[, "Std. Error"]
pct.chg.sd <- round(abs(sd.v-sd.m)/abs(sd.m)*100, digit)
pct.chg.sd
}
pct_chg.sd.bs1 <- pct_chg.sd(best.sub1, best.sub1.v, 2)
pct_chg.sd.bs2 <- pct_chg.sd(best.sub2, best.sub2.v, 2)
pct_chg.sd.sm1 <- pct_chg.sd(step.model1, step.model1.v, 2)
pct_chg.sd.sm2 <- pct_chg.sd(step.model2, step.model2.v, 2)
pct_chg_beta_bs1 <- data.frame(pct_chg.coef.bs1,pct_chg.sd.bs1)
pct_chg_beta_bs1
## pct_chg.coef.bs1 pct_chg.sd.bs1
## (Intercept) 11.66 264.96
## AGE 1.02 231.02
## SEXMALE 292.67 225.76
## SMOKSTATFORMER 428.87 213.68
## SMOKSTATNEVER 44.25 224.40
## QUETELET 16.65 227.37
## VITUSENOT OFTEN 125.46 214.91
## VITUSEOFTEN 2.88 218.36
## FAT 183.93 227.83
## FIBER 404.10 136.38
## BETADIET 93.17 160.10
pct_chg_beta_bs2 <- data.frame(pct_chg.coef.bs2,pct_chg.sd.bs2)
pct_chg_beta_bs2
## pct_chg.coef.bs2 pct_chg.sd.bs2
## (Intercept) 2.08 286.53
## SMOKSTATFORMER 366.16 237.85
## SMOKSTATNEVER 86.99 247.35
## QUETELET 55.08 240.05
## VITUSENOT OFTEN 29.73 234.51
## VITUSEOFTEN 52.99 236.89
## FAT 36.94 222.44
## BETADIET 33.61 193.73
pct_chg_beta_sm1 <- data.frame(pct_chg.coef.sm1,pct_chg.sd.sm1)
pct_chg_beta_sm1
## pct_chg.coef.sm1 pct_chg.sd.sm1
## (Intercept) 6.52 295.25
## QUETELET 44.09 245.38
## VITUSENOT OFTEN 80.38 233.21
## VITUSEOFTEN 31.00 237.46
## BETADIET 44.70 206.85
## FAT 105.81 247.69
## SMOKSTATFORMER 448.97 233.35
## SMOKSTATNEVER 116.91 241.66
## SEXMALE 227.06 240.82
## AGE 132.16 241.01
pct_chg_beta_sm2 <- data.frame(pct_chg.coef.sm2,pct_chg.sd.sm2)
pct_chg_beta_sm2
## pct_chg.coef.sm2 pct_chg.sd.sm2
## (Intercept) 15.19 876.07
## CHOLESTEROL 153.15 338.65
## FIBER 716.81 1319.89
## QUETELET 12.73 150.40
## VITUSENOT OFTEN 2121.14 325.17
## VITUSEOFTEN 177.48 534.40
## BETADIET 39.39 142.91
## AGE 759.02 243.14
## RETDIET 1552.21 423.38
## SMOKSTATFORMER 205.08 1492.22
## SMOKSTATNEVER 30.88 1589.37
## VITUSENOT OFTEN:BETADIET 1145.15 148.24
## VITUSEOFTEN:BETADIET 169.51 120.54
## CHOLESTEROL:RETDIET 3319.38 114.00
## AGE:RETDIET 310.93 271.53
## VITUSENOT OFTEN:SMOKSTATFORMER 156.35 284.97
## VITUSEOFTEN:SMOKSTATFORMER 44.23 506.82
## VITUSENOT OFTEN:SMOKSTATNEVER 768.50 298.94
## VITUSEOFTEN:SMOKSTATNEVER 159.69 526.88
## RETDIET:SMOKSTATFORMER 807.22 375.39
## RETDIET:SMOKSTATNEVER 936.15 442.05
## CHOLESTEROL:QUETELET 66.86 228.24
## BETADIET:AGE 62.98 180.64
## FIBER:SMOKSTATFORMER 329.39 1393.12
## FIBER:SMOKSTATNEVER 291.79 1454.28
## CHOLESTEROL:FIBER 1688.22 346.50
colnames(pct_chg_beta_bs1) <- c("change in coefficient(%)","change in standard deviation(%)" )
colnames(pct_chg_beta_bs2) <- c("change in coefficient(%)","change in standard deviation(%)" )
colnames(pct_chg_beta_sm1) <- c("change in coefficient(%)","change in standard deviation(%)" )
colnames(pct_chg_beta_sm2) <- c("change in coefficient(%)","change in standard deviation(%)" )
#mean squared prediction error
mspe <- function(model, dv, data){
yhat <- predict(model, newdata = data)
y <- data[[dv]]
mean((y-yhat)^2)
}
mspe.bs1 <- mspe(best.sub1, "BETAPLASMA", beta_plasma_v)
mspe.bs2 <- mspe(best.sub2, "BETAPLASMA", beta_plasma_v)
mspe.sm1 <- mspe(step.model1, "BETAPLASMA", beta_plasma_v)
mspe.sm2 <- mspe(step.model2, "BETAPLASMA", beta_plasma_v)
#compare with Pressp/n and SSEp/n
best.sub1.compare <- c(mspe.bs1,press.bs1/n.t,mse.bs1)
best.sub2.compare <- c(mspe.bs2,press.bs2/n.t,mse.bs2)
step.model1.compare <- c(mspe.sm1,press.sm1/n.t,mse.sm1)
step.model2.compare <- c(mspe.sm2,press.sm2/n.t,mse.sm2)
compare2 <-data.frame(best.sub1.compare, best.sub2.compare, step.model1.compare,step.model2.compare)
compare2 <- as.data.frame(t(compare2))
colnames(compare2) <- c("MSPE", "Press/n", "MSE")
rownames(compare2) <- c("Best Subset1", "Best Subset2", "Stepwise model1","Stepwise model2")
compare2
## MSPE Press/n MSE
## Best Subset1 58248.69 0.4345493 0.4147975
## Best Subset2 58256.81 0.4382581 0.4241571
## Stepwise model1 58260.92 0.4339174 0.4161211
## Stepwise model2 58228.58 0.5239414 0.4121339
best.sub1_final <- lm(best.sub1, data = beta_plasma)
summary(best.sub1_final)
##
## Call:
## lm(formula = best.sub1, data = beta_plasma)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5102 -0.3739 0.0314 0.3991 1.9367
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.061e+00 2.783e-01 18.185 < 2e-16 ***
## AGE 6.268e-03 3.007e-03 2.084 0.03795 *
## SEXMALE -2.097e-01 1.289e-01 -1.627 0.10474
## SMOKSTATFORMER 1.996e-01 1.299e-01 1.537 0.12543
## SMOKSTATNEVER 2.493e-01 1.287e-01 1.938 0.05357 .
## QUETELET -3.405e-02 6.768e-03 -5.032 8.33e-07 ***
## VITUSENOT OFTEN 2.581e-01 1.051e-01 2.455 0.01464 *
## VITUSEOFTEN 2.366e-01 9.538e-02 2.480 0.01367 *
## FAT -3.733e-03 1.307e-03 -2.857 0.00457 **
## FIBER 2.309e-02 8.984e-03 2.570 0.01065 *
## BETADIET 6.059e-05 3.109e-05 1.949 0.05224 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7046 on 304 degrees of freedom
## Multiple R-squared: 0.2309, Adjusted R-squared: 0.2056
## F-statistic: 9.127 on 10 and 304 DF, p-value: 3.437e-13
anova(best.sub1_final)
## Analysis of Variance Table
##
## Response: log(BETAPLASMA + 1)
## Df Sum Sq Mean Sq F value Pr(>F)
## AGE 1 3.915 3.9152 7.8853 0.0053062 **
## SEX 1 5.320 5.3202 10.7149 0.0011853 **
## SMOKSTAT 2 3.583 1.7914 3.6079 0.0282756 *
## QUETELET 1 16.674 16.6740 33.5816 1.708e-08 ***
## VITUSE 2 5.184 2.5918 5.2200 0.0059027 **
## FAT 1 1.422 1.4217 2.8632 0.0916496 .
## FIBER 1 7.336 7.3362 14.7751 0.0001475 ***
## BETADIET 1 1.886 1.8858 3.7979 0.0522356 .
## Residuals 304 150.942 0.4965
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(1,1))
plot(best.sub1_final, which=c(1,2))
#check outliers in Y
#n <- nrow(totaldata)
n <- nrow(beta_plasma)
std.d.res <- studres(best.sub1_final)
p.res <- 17
bon_thred <- qt(1-0.1/(2*n), n-p.res)
outlier.Y <- as.vector(which(abs(std.d.res) >= bon_thred))
outlier.Y
## [1] 257
hii <- influence(best.sub1_final)$hat
#check outliers in X
outlier.X <- as.vector(which(hii>(2*p.res/n)))
outlier.X
## [1] 152 225
plot(hii, residuals(best.sub1_final), xlab = "leverage", ylab="residuals")
#influence index plot
plot(best.sub1_final, which = 4)
best.sub1_final2 <- lm(best.sub1_final, data = beta_plasma[-c(35,39,257),])
fitted.sm2 <- fitted(best.sub1_final)
fitted.sm2.rm <- fitted(best.sub1_final2)
SUM <- sum(abs((fitted.sm2[-c(35,39,257)]-fitted.sm2.rm)/fitted.sm2[-c(35,39,257)]))
SUM <- SUM +abs((fitted.sm2[c(35,39,257)]-predict(step.model2, newdata = beta_plasma[c(35,39,257),]))/fitted.sm2[c(35,39,257)])
per.average <- SUM/n
per.average
## 35 39 257
## 0.010107249 0.009746335 0.009944555
summary(best.sub1_final2)
##
## Call:
## lm(formula = best.sub1_final, data = beta_plasma[-c(35, 39, 257),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.96774 -0.37675 0.00819 0.39383 1.89819
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.881e+00 2.545e-01 19.181 < 2e-16 ***
## AGE 6.502e-03 2.733e-03 2.379 0.01796 *
## SEXMALE -3.390e-01 1.194e-01 -2.839 0.00483 **
## SMOKSTATFORMER 2.243e-01 1.188e-01 1.888 0.05992 .
## SMOKSTATNEVER 3.108e-01 1.175e-01 2.646 0.00857 **
## QUETELET -3.095e-02 6.150e-03 -5.033 8.33e-07 ***
## VITUSENOT OFTEN 2.114e-01 9.597e-02 2.203 0.02834 *
## VITUSEOFTEN 2.428e-01 8.720e-02 2.785 0.00570 **
## FAT -2.813e-03 1.207e-03 -2.331 0.02043 *
## FIBER 2.485e-02 8.173e-03 3.041 0.00257 **
## BETADIET 5.344e-05 2.835e-05 1.885 0.06042 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6389 on 301 degrees of freedom
## Multiple R-squared: 0.2677, Adjusted R-squared: 0.2433
## F-statistic: 11 on 10 and 301 DF, p-value: 5.506e-16
The potential influential cases identified previously is the 35th, 39th, and 257th cases, we fit the model without 35th,39th and 257th cases and calculate the average absolute difference in the fitted values as 1.01%, 0.97% and 0.99% respectively. For 35th, 39th and 257th cases, the percentage change on the fitted value with or without the case is very small. Therefore, no case have an unduly large influence on prediction and thus all cases may be retained.
retplasma <- plasma[,-13]
fit.pre <- lm(RETPLASMA~., data = retplasma)
par(mfrow=c(2,2))
plot(fit.pre, which=c(1,2))
boxcox(fit.pre)
hist(log(plasma$RETPLASMA), xlab ="log(RETPLASMA)", main = "Histogram of log(RETPLASMA)")
fit.pre.trans <- lm(log(RETPLASMA)~., data = retplasma)
par(mfrow=c(2,2))
plot(fit.pre.trans, which=c(1,2))
summary(fit.pre.trans)
##
## Call:
## lm(formula = log(RETPLASMA) ~ ., data = retplasma)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.09081 -0.19304 0.01105 0.21121 0.94285
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.138e+00 1.339e-01 45.849 <2e-16 ***
## AGE 4.707e-03 1.469e-03 3.204 0.0015 **
## SEXMALE 1.024e-01 6.229e-02 1.644 0.1013
## SMOKSTATFORMER 1.162e-01 6.128e-02 1.896 0.0589 .
## SMOKSTATNEVER 1.684e-02 6.079e-02 0.277 0.7820
## QUETELET 2.193e-05 3.208e-03 0.007 0.9945
## VITUSENOT OFTEN 3.862e-02 4.975e-02 0.776 0.4382
## VITUSEOFTEN 2.100e-02 4.554e-02 0.461 0.6451
## CALORIES 1.506e-04 1.002e-04 1.503 0.1338
## FAT -2.694e-03 1.581e-03 -1.705 0.0893 .
## FIBER -7.495e-03 5.548e-03 -1.351 0.1778
## ALCOHOL -3.140e-03 2.436e-03 -1.289 0.1983
## CHOLESTEROL -1.646e-04 2.158e-04 -0.763 0.4461
## BETADIET -9.060e-06 1.471e-05 -0.616 0.5383
## RETDIET -1.162e-05 3.656e-05 -0.318 0.7507
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3304 on 300 degrees of freedom
## Multiple R-squared: 0.0984, Adjusted R-squared: 0.05632
## F-statistic: 2.339 on 14 and 300 DF, p-value: 0.004437
set.seed(1024)
index <- sample(1:315, ceiling(315*0.8) , replace = FALSE)
retplasma_t <- retplasma[index,] #training data
retplasma_v <- retplasma[-index,] #validation data
retplasma_t_full <- lm(log(RETPLASMA)~., data=retplasma_t)
summary(retplasma_t_full)
##
## Call:
## lm(formula = log(RETPLASMA) ~ ., data = retplasma_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.05812 -0.19248 0.00453 0.21521 0.96596
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.220e+00 1.466e-01 42.445 < 2e-16 ***
## AGE 4.580e-03 1.648e-03 2.780 0.00588 **
## SEXMALE 3.301e-02 7.012e-02 0.471 0.63823
## SMOKSTATFORMER 8.902e-02 6.885e-02 1.293 0.19728
## SMOKSTATNEVER -3.568e-02 6.751e-02 -0.529 0.59760
## QUETELET -1.244e-03 3.568e-03 -0.349 0.72774
## VITUSENOT OFTEN 6.061e-03 5.662e-02 0.107 0.91483
## VITUSEOFTEN 1.945e-02 5.141e-02 0.378 0.70551
## CALORIES 1.205e-04 1.122e-04 1.074 0.28398
## FAT -2.891e-03 1.776e-03 -1.628 0.10489
## FIBER -5.772e-03 6.296e-03 -0.917 0.36016
## ALCOHOL -2.855e-03 2.655e-03 -1.075 0.28329
## CHOLESTEROL 9.716e-06 2.423e-04 0.040 0.96805
## BETADIET -1.527e-05 1.714e-05 -0.891 0.37392
## RETDIET 7.774e-06 3.949e-05 0.197 0.84411
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3366 on 237 degrees of freedom
## Multiple R-squared: 0.0928, Adjusted R-squared: 0.03921
## F-statistic: 1.732 on 14 and 237 DF, p-value: 0.05037
par(mfrow=c(2,2))
plot(retplasma_t_full)
## Model Selection ### Model selection approach 1: best subset ####
First order model
subset <- regsubsets(log(RETPLASMA)~., data = retplasma_t, nvmax = 12, method ="exhaustive")
sum_sub <- summary(subset)
n.t <- nrow(retplasma_t)
p <- rowSums(sum_sub$which)
SSEp <- sum_sub$rss
R2 <- sum_sub$rsq
Ra2 <- sum_sub$adjr2
Cp <- sum_sub$cp
AIC <- n.t*log(SSEp/n.t)+2*p
BIC <-n.t*log(SSEp/n.t)+log(n.t)*p
criteria <- cbind(sum_sub$which, SSEp, R2, Ra2, Cp, AIC, BIC)
#add the criteria of null model
retplasma_t_null<- lm(log(RETPLASMA)~1, data = retplasma_t)
sse0 <- anova(retplasma_t_null)["Residuals",2]
r0 <- summary(retplasma_t_null)$r.squared
ra0 <- summary(retplasma_t_null)$adj.r.squared
p0 <- 1
cp0 <- (sse0/summary(retplasma_t_full)$sigma^2)-(n.t-2*p0)
aic0 <- n.t*log(sse0/n.t)+2*p0
bic0 <- n.t*log(sse0/n.t)+log(n.t)*p0
null <- c(1,rep(0,14),sse0,r0,ra0,cp0,aic0,bic0)
criteria <- rbind(null, criteria)
criteria <- as.data.frame(criteria)
criteria
## (Intercept) AGE SEXMALE SMOKSTATFORMER SMOKSTATNEVER QUETELET
## null 1 0 0 0 0 0
## 1 1 1 0 0 0 0
## 2 1 1 0 1 0 0
## 3 1 1 0 1 0 0
## 4 1 1 0 1 0 0
## 5 1 1 0 1 1 0
## 6 1 1 0 1 0 0
## 7 1 1 0 1 0 0
## 8 1 1 0 1 1 0
## 9 1 1 1 1 1 0
## 10 1 1 1 1 1 0
## 11 1 1 1 1 1 1
## 12 1 1 1 1 1 1
## VITUSENOT OFTEN VITUSEOFTEN CALORIES FAT FIBER ALCOHOL CHOLESTEROL
## null 0 0 0 0 0 0 0
## 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0
## 3 0 0 0 1 0 0 0
## 4 0 0 0 1 0 0 0
## 5 0 0 0 1 0 0 0
## 6 0 0 1 1 1 1 0
## 7 0 0 1 1 1 1 0
## 8 0 0 1 1 1 1 0
## 9 0 0 1 1 1 1 0
## 10 0 1 1 1 1 1 0
## 11 0 1 1 1 1 1 0
## 12 0 1 1 1 1 1 0
## BETADIET RETDIET SSEp R2 Ra2 Cp AIC
## null 0 0 29.60318 0.00000000 0.00000000 11.2430086 -537.6699
## 1 0 0 28.29976 0.04402980 0.04020592 1.7405317 -547.0171
## 2 0 0 27.73321 0.06316781 0.05564305 -1.2591400 -550.1132
## 3 0 0 27.31759 0.07720749 0.06604467 -2.9269074 -551.9184
## 4 1 0 27.13028 0.08353489 0.06869335 -2.5798972 -551.6522
## 5 1 0 27.09431 0.08474999 0.06614735 -0.8973332 -549.9866
## 6 0 0 27.05596 0.08604557 0.06366302 0.7642043 -548.3435
## 7 1 0 26.95609 0.08941904 0.06329582 1.8829083 -547.2754
## 8 1 0 26.91770 0.09071602 0.06078075 3.5440815 -545.6346
## 9 1 0 26.89402 0.09151586 0.05772926 5.3351291 -543.8564
## 10 1 0 26.87563 0.09213688 0.05446621 7.1728939 -542.0287
## 11 1 0 26.86247 0.09258152 0.05099151 9.0567336 -540.1521
## 12 1 1 26.85754 0.09274795 0.04719554 11.0132556 -538.1983
## BIC
## null -534.1405
## 1 -539.9583
## 2 -539.5249
## 3 -537.8006
## 4 -534.0051
## 5 -528.8100
## 6 -523.6375
## 7 -519.0400
## 8 -513.8697
## 9 -508.5621
## 10 -503.2049
## 11 -497.7990
## 12 -492.3158
which.best.sub <- data.frame(
Ra2 = which.max(criteria$Ra2),
Cp = which.min(criteria$Cp),
AIC = which.min(criteria$AIC),
BIC = which.min(criteria$BIC)
)
which.best.sub
## Ra2 Cp AIC BIC
## 1 5 4 4 2
rbind(criteria[5,],rbind(criteria[4,],criteria[2,]))
## (Intercept) AGE SEXMALE SMOKSTATFORMER SMOKSTATNEVER QUETELET VITUSENOT OFTEN
## 4 1 1 0 1 0 0 0
## 3 1 1 0 1 0 0 0
## 1 1 1 0 0 0 0 0
## VITUSEOFTEN CALORIES FAT FIBER ALCOHOL CHOLESTEROL BETADIET RETDIET SSEp
## 4 0 0 1 0 0 0 1 0 27.13028
## 3 0 0 1 0 0 0 0 0 27.31759
## 1 0 0 0 0 0 0 0 0 28.29976
## R2 Ra2 Cp AIC BIC
## 4 0.08353489 0.06869335 -2.579897 -551.6522 -534.0051
## 3 0.07720749 0.06604467 -2.926907 -551.9184 -537.8006
## 1 0.04402980 0.04020592 1.740532 -547.0171 -539.9583
By adjusted r squared, the model containing AGE, SMOKSTAT, FAT, BETADIET is the best. By Mallow’s Cp and AIC criterion, the model containing AGE, SMOKSTAT, FAT is the best. By BIC criteria which prefers a smaller model than AIC criteria, the model containing AGE is the best.
#best subset from adjusted r square
best.sub1 <- lm(log(RETPLASMA)~AGE+SMOKSTAT+FAT+BETADIET, data = retplasma_t)
summary(best.sub1)
##
## Call:
## lm(formula = log(RETPLASMA) ~ AGE + SMOKSTAT + FAT + BETADIET,
## data = retplasma_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.05877 -0.18077 0.00587 0.21069 0.97001
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.223e+00 1.055e-01 58.994 < 2e-16 ***
## AGE 4.354e-03 1.450e-03 3.003 0.00295 **
## SMOKSTATFORMER 8.765e-02 6.647e-02 1.319 0.18853
## SMOKSTATNEVER -3.641e-02 6.372e-02 -0.571 0.56819
## FAT -1.107e-03 6.312e-04 -1.754 0.08063 .
## BETADIET -1.792e-05 1.454e-05 -1.232 0.21894
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3319 on 246 degrees of freedom
## Multiple R-squared: 0.08475, Adjusted R-squared: 0.06615
## F-statistic: 4.556 on 5 and 246 DF, p-value: 0.000543
par(mfrow=c(2,2))
plot(best.sub1)
#best subset from Cp and AIC
best.sub2 <- lm(log(RETPLASMA)~AGE+SMOKSTAT+FAT, data = retplasma_t)
summary(best.sub2)
##
## Call:
## lm(formula = log(RETPLASMA) ~ AGE + SMOKSTAT + FAT, data = retplasma_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.10481 -0.18420 0.00993 0.21327 0.98907
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.2067638 0.1047904 59.230 < 2e-16 ***
## AGE 0.0042786 0.0014500 2.951 0.00347 **
## SMOKSTATFORMER 0.0746569 0.0656984 1.136 0.25691
## SMOKSTATNEVER -0.0451414 0.0633872 -0.712 0.47704
## FAT -0.0012418 0.0006224 -1.995 0.04711 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3322 on 247 degrees of freedom
## Multiple R-squared: 0.0791, Adjusted R-squared: 0.06418
## F-statistic: 5.304 on 4 and 247 DF, p-value: 0.0004097
par(mfrow=c(2,2))
plot(best.sub2)
#best subset from BIC
best.sub3 <- lm(log(RETPLASMA)~AGE+FAT, data = retplasma_t)
summary(best.sub3)
##
## Call:
## lm(formula = log(RETPLASMA) ~ AGE + FAT, data = retplasma_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1580 -0.1800 0.0038 0.2122 1.0648
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.186871 0.096875 63.865 < 2e-16 ***
## AGE 0.004414 0.001442 3.062 0.00244 **
## FAT -0.001013 0.000621 -1.631 0.10408
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3353 on 249 degrees of freedom
## Multiple R-squared: 0.05414, Adjusted R-squared: 0.04654
## F-statistic: 7.126 on 2 and 249 DF, p-value: 0.0009783
par(mfrow=c(2,2))
plot(best.sub3)
step.sub1 <- stepAIC(retplasma_t_null, scope = list(upper=retplasma_t_full, lower=~1), direction = "both", k = 2, trace = FALSE)
summary(step.sub1) #It's the same result as the one we got from best subset selection from Cp and AIC
##
## Call:
## lm(formula = log(RETPLASMA) ~ AGE + SMOKSTAT + FAT, data = retplasma_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.10481 -0.18420 0.00993 0.21327 0.98907
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.2067638 0.1047904 59.230 < 2e-16 ***
## AGE 0.0042786 0.0014500 2.951 0.00347 **
## SMOKSTATFORMER 0.0746569 0.0656984 1.136 0.25691
## SMOKSTATNEVER -0.0451414 0.0633872 -0.712 0.47704
## FAT -0.0012418 0.0006224 -1.995 0.04711 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3322 on 247 degrees of freedom
## Multiple R-squared: 0.0791, Adjusted R-squared: 0.06418
## F-statistic: 5.304 on 4 and 247 DF, p-value: 0.0004097
retplasma_t_full2 <- lm(log(RETPLASMA)~.^2, data = retplasma_t)
step.sub2 <- stepAIC(retplasma_t_null, scope = list(upper=retplasma_t_full2, lower=~1), direction = "both", k = 2, trace = FALSE)
summary(step.sub2)
##
## Call:
## lm(formula = log(RETPLASMA) ~ AGE + SMOKSTAT + FAT, data = retplasma_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.10481 -0.18420 0.00993 0.21327 0.98907
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.2067638 0.1047904 59.230 < 2e-16 ***
## AGE 0.0042786 0.0014500 2.951 0.00347 **
## SMOKSTATFORMER 0.0746569 0.0656984 1.136 0.25691
## SMOKSTATNEVER -0.0451414 0.0633872 -0.712 0.47704
## FAT -0.0012418 0.0006224 -1.995 0.04711 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3322 on 247 degrees of freedom
## Multiple R-squared: 0.0791, Adjusted R-squared: 0.06418
## F-statistic: 5.304 on 4 and 247 DF, p-value: 0.0004097
# same as the step.sub1
#test on 3 best models so far
candidate_m1 <- lm(log(RETPLASMA)~AGE+SMOKSTAT+FAT+BETADIET, data = retplasma_t)
candidate_m2 <- lm(log(RETPLASMA) ~ AGE + SMOKSTAT + FAT, data = retplasma_t)
candidate_m3 <- lm(log(RETPLASMA) ~ AGE + FAT, data = retplasma_t)
#find SSE for candidate_m1 and candidate_m2
sse.cm1 <- anova(candidate_m1)["Residuals",2]
sse.cm2 <- anova(candidate_m2)["Residuals",2]
sse.cm3 <- anova(candidate_m3)["Residuals",2]
sse.compare <- c(sse.cm1,sse.cm2,sse.cm3)
#find MSE for candidate_m1 and candidate_m2
mse.cm1 <- anova(candidate_m1)["Residuals",3]
mse.cm2 <- anova(candidate_m2)["Residuals",3]
mse.cm3 <- anova(candidate_m3)["Residuals",3]
mse.compare <- c(mse.cm1,mse.cm2, mse.cm3)
#find p for candidate_m1 and candidate_m2
p.cm1 <- 5
p.cm2 <- 4
p.cm3 <- 3
p.compare <- c(p.cm1,p.cm2, p.cm3)
#find Cp for candidate_m1 and candidate_m2
#sigma^2: MSE_fullmodel
#n.t: nrow(training data)
mse_full <- summary(retplasma_t_full)$sigma^2
cp.cm1 <- (sse.cm1/mse_full)-(n.t-2*p.cm1)
cp.cm2 <- (sse.cm2/mse_full)-(n.t-2*p.cm2)
cp.cm3 <- (sse.cm3/mse_full)-(n.t-2*p.cm3)
cp.compare <- c(cp.cm1,cp.cm2,cp.cm3)
#find Pressp for candidate_m1 and candidate_m2
press.cm1 <- sum(candidate_m1$residuals^2/(1-influence(candidate_m1)$hat)^2)
press.cm2 <- sum(candidate_m2$residuals^2/(1-influence(candidate_m2)$hat)^2)
press.cm3 <- sum(candidate_m3$residuals^2/(1-influence(candidate_m3)$hat)^2)
press.compare <- c(press.cm1,press.cm2,press.cm3)
compare <- data.frame(sse=sse.compare, mse=mse.compare, p=p.compare, cp=cp.compare, press=press.compare)
rownames(compare) <- c("candidate_m1","candidate_m2","candidate_m3")
compare
## sse mse p cp press
## candidate_m1 27.09431 0.1101395 5 -2.897333 28.65852
## candidate_m2 27.26162 0.1103709 4 -3.420886 28.40186
## candidate_m3 28.00048 0.1124517 3 1.099474 28.67201
##candidate_m1 on validation data
candidate_m1.v <- lm(candidate_m1, data = retplasma_v)
candidate_m2.v <- lm(candidate_m2, data = retplasma_v)
candidate_m3.v <- lm(candidate_m3, data = retplasma_v)
#summary on training data and validation data respectively
list(summary(candidate_m1), summary(candidate_m2), summary(candidate_m3))
## [[1]]
##
## Call:
## lm(formula = log(RETPLASMA) ~ AGE + SMOKSTAT + FAT + BETADIET,
## data = retplasma_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.05877 -0.18077 0.00587 0.21069 0.97001
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.223e+00 1.055e-01 58.994 < 2e-16 ***
## AGE 4.354e-03 1.450e-03 3.003 0.00295 **
## SMOKSTATFORMER 8.765e-02 6.647e-02 1.319 0.18853
## SMOKSTATNEVER -3.641e-02 6.372e-02 -0.571 0.56819
## FAT -1.107e-03 6.312e-04 -1.754 0.08063 .
## BETADIET -1.792e-05 1.454e-05 -1.232 0.21894
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3319 on 246 degrees of freedom
## Multiple R-squared: 0.08475, Adjusted R-squared: 0.06615
## F-statistic: 4.556 on 5 and 246 DF, p-value: 0.000543
##
##
## [[2]]
##
## Call:
## lm(formula = log(RETPLASMA) ~ AGE + SMOKSTAT + FAT, data = retplasma_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.10481 -0.18420 0.00993 0.21327 0.98907
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.2067638 0.1047904 59.230 < 2e-16 ***
## AGE 0.0042786 0.0014500 2.951 0.00347 **
## SMOKSTATFORMER 0.0746569 0.0656984 1.136 0.25691
## SMOKSTATNEVER -0.0451414 0.0633872 -0.712 0.47704
## FAT -0.0012418 0.0006224 -1.995 0.04711 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3322 on 247 degrees of freedom
## Multiple R-squared: 0.0791, Adjusted R-squared: 0.06418
## F-statistic: 5.304 on 4 and 247 DF, p-value: 0.0004097
##
##
## [[3]]
##
## Call:
## lm(formula = log(RETPLASMA) ~ AGE + FAT, data = retplasma_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1580 -0.1800 0.0038 0.2122 1.0648
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.186871 0.096875 63.865 < 2e-16 ***
## AGE 0.004414 0.001442 3.062 0.00244 **
## FAT -0.001013 0.000621 -1.631 0.10408
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3353 on 249 degrees of freedom
## Multiple R-squared: 0.05414, Adjusted R-squared: 0.04654
## F-statistic: 7.126 on 2 and 249 DF, p-value: 0.0009783
list(summary(candidate_m1.v), summary(candidate_m2.v), summary(candidate_m3.v))
## [[1]]
##
## Call:
## lm(formula = candidate_m1, data = retplasma_v)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.83435 -0.18186 0.01204 0.19784 0.88838
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.754e+00 2.321e-01 24.791 <2e-16 ***
## AGE 7.766e-03 3.191e-03 2.434 0.0181 *
## SMOKSTATFORMER 3.096e-01 1.427e-01 2.169 0.0343 *
## SMOKSTATNEVER 2.459e-01 1.404e-01 1.751 0.0853 .
## FAT 3.784e-04 1.319e-03 0.287 0.7752
## BETADIET -1.736e-06 2.848e-05 -0.061 0.9516
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3099 on 57 degrees of freedom
## Multiple R-squared: 0.1619, Adjusted R-squared: 0.08836
## F-statistic: 2.202 on 5 and 57 DF, p-value: 0.06655
##
##
## [[2]]
##
## Call:
## lm(formula = candidate_m2, data = retplasma_v)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8317 -0.1808 0.0135 0.1991 0.8911
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.7540061 0.2300812 25.009 <2e-16 ***
## AGE 0.0077207 0.0030743 2.511 0.0148 *
## SMOKSTATFORMER 0.3090616 0.1411968 2.189 0.0326 *
## SMOKSTATNEVER 0.2446627 0.1377798 1.776 0.0810 .
## FAT 0.0003715 0.0013026 0.285 0.7765
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3072 on 58 degrees of freedom
## Multiple R-squared: 0.1618, Adjusted R-squared: 0.104
## F-statistic: 2.8 on 4 and 58 DF, p-value: 0.03404
##
##
## [[3]]
##
## Call:
## lm(formula = candidate_m3, data = retplasma_v)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8036 -0.2095 0.0399 0.1953 0.8815
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.027e+00 1.948e-01 30.941 <2e-16 ***
## AGE 7.663e-03 3.128e-03 2.449 0.0172 *
## FAT 4.376e-05 1.322e-03 0.033 0.9737
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3143 on 60 degrees of freedom
## Multiple R-squared: 0.09255, Adjusted R-squared: 0.06231
## F-statistic: 3.06 on 2 and 60 DF, p-value: 0.05428
#percent change in parameter estimation
pct_chg.coef <- function(model, modelv, digit){
coef.m <- coef(model)
coef.v <- coef(modelv)
pct.chg.coef <- round(abs(coef.v-coef.m)/abs(coef.m)*100, digit)
pct.chg.coef
}
pct_chg.coef.cm1 <- pct_chg.coef(candidate_m1, candidate_m1.v, 2)
pct_chg.coef.cm2 <- pct_chg.coef(candidate_m2, candidate_m2.v, 2)
pct_chg.coef.cm3 <- pct_chg.coef(candidate_m3, candidate_m3.v, 2)
#percent change in standard error
pct_chg.sd <- function(model, modelv, digit){
sd.m <- summary(model)$coefficients[, "Std. Error"]
sd.v <- summary(modelv)$coefficients[, "Std. Error"]
pct.chg.sd <- round(abs(sd.v-sd.m)/abs(sd.m)*100, digit)
pct.chg.sd
}
pct_chg.sd.cm1 <- pct_chg.sd(candidate_m1, candidate_m1.v, 2)
pct_chg.sd.cm2 <- pct_chg.sd(candidate_m2, candidate_m2.v, 2)
pct_chg.sd.cm3 <- pct_chg.sd(candidate_m3, candidate_m3.v, 2)
pct_chg_beta_cm1 <- data.frame(pct_chg.coef.cm1,pct_chg.sd.cm1)
pct_chg_beta_cm1
## pct_chg.coef.cm1 pct_chg.sd.cm1
## (Intercept) 7.53 120.05
## AGE 78.38 120.08
## SMOKSTATFORMER 253.28 114.75
## SMOKSTATNEVER 775.24 120.34
## FAT 134.18 108.93
## BETADIET 90.32 95.87
pct_chg_beta_cm2 <- data.frame(pct_chg.coef.cm2,pct_chg.sd.cm2)
pct_chg_beta_cm2
## pct_chg.coef.cm2 pct_chg.sd.cm2
## (Intercept) 7.29 119.56
## AGE 80.45 112.03
## SMOKSTATFORMER 313.98 114.92
## SMOKSTATNEVER 641.99 117.36
## FAT 129.92 109.30
pct_chg_beta_cm3 <- data.frame(pct_chg.coef.cm3,pct_chg.sd.cm3)
pct_chg_beta_cm3
## pct_chg.coef.cm3 pct_chg.sd.cm3
## (Intercept) 2.58 101.08
## AGE 73.59 116.96
## FAT 104.32 112.85
colnames(pct_chg_beta_cm1) <- c("change in coefficient(%)","change in standard deviation(%)" )
colnames(pct_chg_beta_cm2) <- c("change in coefficient(%)","change in standard deviation(%)" )
colnames(pct_chg_beta_cm3) <- c("change in coefficient(%)","change in standard deviation(%)" )
#mean squared prediction error
mspe <- function(model, dv, data){
yhat <- predict(model, newdata = data)
y <- data[[dv]]
mean((y-yhat)^2)
}
mspe.cm1 <- mspe(candidate_m1, "RETPLASMA", retplasma_v)
mspe.cm2 <- mspe(candidate_m2, "RETPLASMA", retplasma_v)
mspe.cm3 <- mspe(candidate_m3, "RETPLASMA", retplasma_v)
#compare with Pressp/n and SSEp/n
candidate_m1.compare <- c(mspe.cm1,press.cm1/n.t,mse.cm1)
candidate_m2.compare <- c(mspe.cm2,press.cm2/n.t,mse.cm2)
candidate_m3.compare <- c(mspe.cm3,press.cm3/n.t,mse.cm3)
compare2 <-data.frame(candidate_m1.compare, candidate_m2.compare, candidate_m3.compare)
compare2 <- as.data.frame(t(compare2))
colnames(compare2) <- c("MSPE", "Press/n", "MSE")
rownames(compare2) <- c("candidate_m1","candidate_m2","candidate_m3")
compare2
## MSPE Press/n MSE
## candidate_m1 435048.0 0.1137243 0.1101395
## candidate_m2 435048.2 0.1127058 0.1103709
## candidate_m3 435050.0 0.1137778 0.1124517
Choose candidate_m2: Age, Smokstat, Fat
candidate_m2_final <- lm(candidate_m2, data = retplasma)
summary(candidate_m2_final)
##
## Call:
## lm(formula = candidate_m2, data = retplasma)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1368 -0.1846 0.0036 0.2103 0.9818
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.1547933 0.0949166 64.844 < 2e-16 ***
## AGE 0.0045494 0.0013053 3.485 0.000562 ***
## SMOKSTATFORMER 0.1079461 0.0593416 1.819 0.069866 .
## SMOKSTATNEVER 0.0031563 0.0575199 0.055 0.956275
## FAT -0.0010254 0.0005617 -1.826 0.068887 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3289 on 310 degrees of freedom
## Multiple R-squared: 0.077, Adjusted R-squared: 0.06509
## F-statistic: 6.465 on 4 and 310 DF, p-value: 5.228e-05
anova(candidate_m2_final)
## Analysis of Variance Table
##
## Response: log(RETPLASMA)
## Df Sum Sq Mean Sq F value Pr(>F)
## AGE 1 1.720 1.71983 15.8984 8.341e-05 ***
## SMOKSTAT 2 0.717 0.35854 3.3144 0.03765 *
## FAT 1 0.360 0.36049 3.3325 0.06889 .
## Residuals 310 33.535 0.10818
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(1,1))
plot(candidate_m2_final, which=c(1,2))
#check outliers in Y
#n <- nrow(totaldata)
n <- nrow(retplasma)
std.d.res <- studres(candidate_m2_final)
p.res <- 4
bon_thred <- qt(1-0.1/(2*n), n-p.res)
outlier.Y <- as.vector(which(abs(std.d.res) >= bon_thred))
outlier.Y
## integer(0)
hii <- influence(candidate_m2_final)$hat
#check outliers in X
outlier.X <- as.vector(which(hii>(2*p.res/n)))
outlier.X
## [1] 32 33 42 44 50 62 68 75 82 88 89 95 100 101 109 111 122 124 134
## [20] 144 152 170 180 181 195 202 212 215 220 222 223 239 257 266 269 274 280 282
## [39] 289 290 296 302 305 308
plot(hii, residuals(candidate_m2_final), xlab = "leverage", ylab="residuals")
#influence index plot
plot(candidate_m2_final, which = 4)
candidate_m2_final2 <- lm(candidate_m2_final, data = retplasma[-c(36,296),])
f1 <- fitted(candidate_m2_final)
f2 <- fitted(candidate_m2_final2)
SUM<-sum(abs((f1[-c(36,296)]-f2)/f1[-c(36,296)]))
SUM<-SUM+abs((f1[c(36,296)]-predict(candidate_m2_final,newdata = retplasma[c(36,296),]))/f1[c(36,296)])
per.average <- SUM/n
per.average
## 36 296
## 0.0004030076 0.0004030076
summary(candidate_m2)
##
## Call:
## lm(formula = log(RETPLASMA) ~ AGE + SMOKSTAT + FAT, data = retplasma_t)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.10481 -0.18420 0.00993 0.21327 0.98907
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.2067638 0.1047904 59.230 < 2e-16 ***
## AGE 0.0042786 0.0014500 2.951 0.00347 **
## SMOKSTATFORMER 0.0746569 0.0656984 1.136 0.25691
## SMOKSTATNEVER -0.0451414 0.0633872 -0.712 0.47704
## FAT -0.0012418 0.0006224 -1.995 0.04711 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3322 on 247 degrees of freedom
## Multiple R-squared: 0.0791, Adjusted R-squared: 0.06418
## F-statistic: 5.304 on 4 and 247 DF, p-value: 0.0004097
The potential influential cases identified previously is the 36th and 296th cases, we fit the model without 36th and 296th cases and calculate the average absolute difference in the fitted values as 0.040% respectively. For 36th and 296th cases, the percentage change on the fitted value with or without the case is very small. Therefore, no case have an unduly large influence on prediction and thus all cases may be retained.